data("diamonds")
## Warning in data("diamonds"): data set 'diamonds' not found
library(ggplot2)
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
scale_x_continuous(limits = c(0,quantile(diamonds$carat,0.99))) +
scale_y_continuous(limits = c(0,quantile(diamonds$price,0.99))) +
geom_point(na.rm = TRUE,color = 'black', fill = 'orange', shape = 21)
Response: Non-linear looks more exponential
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
scale_x_continuous(limits = c(0,quantile(diamonds$carat,0.99))) +
scale_y_continuous(limits = c(0,quantile(diamonds$price,0.99))) +
geom_point(na.rm = TRUE) + geom_smooth(method = 'lm')
# install these if necessary
# install.packages('GGally')
#install.packages('scales')
#install.packages('memisc')
# install.packages('lattice')
# install.packages('MASS')
# install.packages('car')
# install.packages('reshape')
# install.packages('plyr')
# load the ggplot graphics package and the others
library(ggplot2)
library(GGally)
library(scales)
library(memisc)
# sample 10,000 diamonds from the data set
set.seed(20022012)
diamond_samp <- diamonds[sample(1:length(diamonds$price), 10000), ]
ggpairs(diamond_samp,
lower = list(continuous = wrap("points", shape = I('.'))),
upper = list(combo = wrap("box", outlier.shape = I('.'))))
library(gridExtra)
plot1 <- ggplot(data = diamonds, mapping = aes(x = price)) +
geom_histogram() +
ggtitle('Price')
plot2 <- plot1 + scale_x_log10() + ggtitle('Price (log10)')
grid.arrange(plot1,plot2)
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
geom_point() +
scale_y_log10()
cuberoot_trans = function() trans_new('cuberoot', transform = function(x) x^(1/3),
inverse = function(x) x^3)
ggplot(aes(carat, price), data = diamonds) +
geom_point(na.rm = TRUE) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
head(sort(table(diamonds$carat),decreasing = T))
##
## 0.3 0.31 1.01 0.7 0.32 1
## 2604 2249 2242 1981 1840 1558
head(sort(table(diamonds$price),decreasing = T))
##
## 605 802 625 828 776 698
## 132 127 126 125 124 121
ggplot(aes(carat, price), data = diamonds) +
geom_point(na.rm = TRUE, alpha = 1/20, size = 3/4, position = 'jitter') +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
Notes: Clarity is another feature which might be useful
Alter the code below.
# install and load the RColorBrewer package
# install.packages('RColorBrewer')
library(RColorBrewer)
ggplot(aes(x = carat, y = price), data = diamonds) +
geom_point(na.rm = TRUE, alpha = 0.5, size = 1, position = 'jitter',
mapping = aes(color = clarity)) +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Clarity', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
Response: Price does vary by clarity, diamonds with higher clarity have higher prices
Alter the code below.
ggplot(aes(x = carat, y = price, color = clarity), data = diamonds) +
geom_point(na.rm = TRUE, alpha = 0.5, size = 1, position = 'jitter',
mapping = aes(color = cut)) +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Clarity', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
Response: Not much variation is explained by coloring using the cut of the diamonds
Alter the code below.
ggplot(data = diamonds, aes(x = carat, y = price, color = color)) +
geom_point(na.rm = TRUE, alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Color', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Cut')
m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamonds)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)
##
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamonds)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamonds)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamonds)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color,
## data = diamonds)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color +
## clarity, data = diamonds)
##
## ========================================================================================
## m1 m2 m3 m4 m5
## ----------------------------------------------------------------------------------------
## (Intercept) 2.821*** 1.039*** 0.874*** 0.932*** 0.415***
## (0.006) (0.019) (0.019) (0.017) (0.010)
## I(carat^(1/3)) 5.558*** 8.568*** 8.703*** 8.438*** 9.144***
## (0.007) (0.032) (0.031) (0.028) (0.016)
## carat -1.137*** -1.163*** -0.992*** -1.093***
## (0.012) (0.011) (0.010) (0.006)
## cut: .L 0.224*** 0.224*** 0.120***
## (0.004) (0.004) (0.002)
## cut: .Q -0.062*** -0.062*** -0.031***
## (0.004) (0.003) (0.002)
## cut: .C 0.051*** 0.052*** 0.014***
## (0.003) (0.003) (0.002)
## cut: ^4 0.018*** 0.018*** -0.002
## (0.003) (0.002) (0.001)
## color: .L -0.373*** -0.441***
## (0.003) (0.002)
## color: .Q -0.129*** -0.093***
## (0.003) (0.002)
## color: .C 0.001 -0.013***
## (0.003) (0.002)
## color: ^4 0.029*** 0.012***
## (0.003) (0.002)
## color: ^5 -0.016*** -0.003*
## (0.003) (0.001)
## color: ^6 -0.023*** 0.001
## (0.002) (0.001)
## clarity: .L 0.907***
## (0.003)
## clarity: .Q -0.240***
## (0.003)
## clarity: .C 0.131***
## (0.003)
## clarity: ^4 -0.063***
## (0.002)
## clarity: ^5 0.026***
## (0.002)
## clarity: ^6 -0.002
## (0.002)
## clarity: ^7 0.032***
## (0.001)
## ----------------------------------------------------------------------------------------
## R-squared 0.924 0.935 0.939 0.951 0.984
## N 53940 53940 53940 53940 53940
## ========================================================================================
## Significance: *** = p < 0.001; ** = p < 0.01; * = p < 0.05
Notice how adding cut to our model does not help explain much of the variance in the price of diamonds. This fits with out exploration earlier.
Research: (Take some time to come up with 2-4 problems for the model) (You should 10-20 min on this)
#install.package('bitops') # throws an error
#install.packages('RCurl')
#library('bitops')
library('RCurl')
## Loading required package: bitops
#diamondsurl = getBinaryURL("https://raw.github.com/solomonm/diamonds-data/master/BigDiamonds.Rda")
#load(rawConnection(diamondsurl))
load('./BigDiamonds.Rda')
names(diamondsbig)
## [1] "carat" "cut" "color" "clarity"
## [5] "table" "depth" "cert" "measurements"
## [9] "price" "x" "y" "z"
The code used to obtain the data is available here: https://github.com/solomonm/diamonds-data
Notes:
m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamondsbig)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
suppressMessages(library(lattice))
suppressMessages(library(MASS))
suppressMessages(library(memisc))
mtable(m1, m2, m3, m4, m5)
##
## Calls:
## m1: lm(formula = I(log(price)) ~ I(carat^(1/3)), data = diamondsbig)
## m2: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat, data = diamondsbig)
## m3: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut, data = diamondsbig)
## m4: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color,
## data = diamondsbig)
## m5: lm(formula = I(log(price)) ~ I(carat^(1/3)) + carat + cut + color +
## clarity, data = diamondsbig)
##
## =============================================================================================
## m1 m2 m3 m4 m5
## ---------------------------------------------------------------------------------------------
## (Intercept) 3.096*** 1.406*** 1.218*** 0.405*** -0.663***
## (0.002) (0.005) (0.005) (0.006) (0.006)
## I(carat^(1/3)) 5.317*** 7.911*** 7.920*** 8.170*** 8.368***
## (0.002) (0.008) (0.008) (0.007) (0.005)
## carat -0.767*** -0.779*** -0.782*** -0.815***
## (0.002) (0.002) (0.002) (0.001)
## cut: V.Good 0.119*** 0.092*** 0.059***
## (0.002) (0.002) (0.001)
## cut: Ideal 0.256*** 0.222*** 0.130***
## (0.002) (0.001) (0.001)
## color: K/L 0.134*** 0.128***
## (0.004) (0.003)
## color: J/L 0.302*** 0.325***
## (0.004) (0.003)
## color: I/L 0.422*** 0.457***
## (0.003) (0.003)
## color: H/L 0.517*** 0.560***
## (0.003) (0.003)
## color: G/L 0.627*** 0.661***
## (0.003) (0.002)
## color: F/L 0.723*** 0.751***
## (0.003) (0.002)
## color: E/L 0.790*** 0.805***
## (0.003) (0.002)
## color: D/L 0.894*** 0.886***
## (0.003) (0.003)
## clarity: I1 0.355***
## (0.005)
## clarity: SI2 0.684***
## (0.005)
## clarity: SI1 0.834***
## (0.005)
## clarity: VS2 0.979***
## (0.005)
## clarity: VS1 1.067***
## (0.005)
## clarity: VVS2 1.145***
## (0.005)
## clarity: VVS1 1.224***
## (0.005)
## clarity: IF 1.346***
## (0.005)
## ---------------------------------------------------------------------------------------------
## R-squared 0.893 0.911 0.915 0.940 0.968
## N 597311 597311 597311 597311 597311
## =============================================================================================
## Significance: *** = p < 0.001; ** = p < 0.01; * = p < 0.05
Example Diamond from BlueNile: Round 1.00 Very Good I VS1 $5,601
# Be sure you’ve loaded the library memisc and have m5 saved as an object in your workspace.
thisDiamond = data.frame(carat = 1.00, cut = "V.Good",
color = "I", clarity="VS1")
modelEstimate = predict(m5, newdata = thisDiamond,
interval="prediction", level = .95)
dat = data.frame(m4$model, m4$residuals)
with(dat, sd(m4.residuals))
## [1] 0.3176132
with(subset(dat, carat > .9 & carat < 1.1), sd(m4.residuals))
## [1] 0.3668827
dat$resid <- as.numeric(dat$m4.residuals)
ggplot(aes(y = resid, x = round(carat, 2)), data = dat) +
geom_line(stat = "summary", fun.y = sd)